library(tidyverse)
## -- Attaching packages -------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ----------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(sf)
## Warning: package 'sf' was built under R version 4.0.3
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
library(tmap)
## Warning: package 'tmap' was built under R version 4.0.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(readxl)
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.0.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.0.3
library(scales)
## Warning: package 'scales' was built under R version 4.0.3
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Show the working directory; the relative data paths below are resolved
# against it.
getwd()
## [1] "C:/Users/Greg_Dills/Desktop/School/Data_Visualization/Data_Viz_Project_2/Data_Viz_Pro2/report"

# Folder holding the Florida lakes files, expressed relative to the working
# directory (the report/ folder printed above) instead of a machine-specific
# absolute path, so the script also runs from another checkout of the project.
fl_lakes_dir <- file.path("..", "data", "Florida_Lakes", "Florida_Lakes")

fl_lakes <- read_sf(file.path(fl_lakes_dir, "Florida_Lakes.shp"))
# Successfully read the lake shapefile

head(fl_lakes)
# Trying to better understand the data in the .zip file. Neat to see a dbf file type and what it consists of (similar to the shapefile in this case)

library(foreign)
## Warning: package 'foreign' was built under R version 4.0.3
# Preview only the first rows of the attribute table; auto-printing the whole
# data frame floods the console without adding information.
head(read.dbf(file.path(fl_lakes_dir, "Florida_Lakes.dbf")))
summary(fl_lakes)
##    PERIMETER            NAME              COUNTY             OBJECTID   
##  Min.   :    55.4   Length:4243        Length:4243        Min.   :   1  
##  1st Qu.:   708.8   Class :character   Class :character   1st Qu.:1062  
##  Median :  1363.6   Mode  :character   Mode  :character   Median :2122  
##  Mean   :  3289.1                                         Mean   :2122  
##  3rd Qu.:  2736.2                                         3rd Qu.:3182  
##  Max.   :421800.0                                         Max.   :4243  
##    SHAPEAREA            SHAPELEN                 geometry   
##  Min.   :1.840e+02   Min.   :    55.4   MULTIPOLYGON :4243  
##  1st Qu.:2.422e+04   1st Qu.:   708.8   epsg:4326    :   0  
##  Median :7.776e+04   Median :  1363.6   +proj=long...:   0  
##  Mean   :1.045e+06   Mean   :  3289.1                       
##  3rd Qu.:2.456e+05   3rd Qu.:  2736.2                       
##  Max.   :1.296e+09   Max.   :421800.0
# Show the rows whose NAME is exactly "Lake Okeechobee".
filter(fl_lakes, NAME == "Lake Okeechobee")

# Summary data by county: mean PERIMETER and number of rows (lake count).
fl_lakes_summary <- summarize(
  group_by(fl_lakes, COUNTY),
  Average_Perimeter = mean(PERIMETER),
  Total_Lakes = n()
)
## `summarise()` ungrouping output (override with `.groups` argument)
head(fl_lakes_summary)
# Sanity check of the summary: Palm Beach contains the largest lake
# (Lake Okeechobee), so its average perimeter makes sense. I am not familiar
# with Monroe County, however, and intend to explore it further.

# Counties ordered from the largest average perimeter to the smallest.
arrange(fl_lakes_summary, desc(Average_Perimeter))

# Link to “The Cutoff” lake visual (NOTE: the hyperlink target is missing from this export)

# Interestingly enough, Monroe County is where the Everglades reside, and I would not have classified many of the "lakes" in this county as lakes. For example, "The Cutoff" appears to be more like a river/estuary system; a graphical representation can be found in the link above. After reading more of the metadata, I found that the observations can also be reservoirs.


# List every lake record whose COUNTY is "MONROE".
filter(fl_lakes, COUNTY == "MONROE")
# Interactive bar chart of the average lake perimeter per county, with the
# bars ordered from the largest average to the smallest.
#
# The x-axis title, text and ticks are blanked out, so the county names do not
# appear on the static axis; ggplotly() turns the chart into an interactive
# widget with hover tooltips.
bar_plot <- ggplot(
  fl_lakes_summary,
  aes(reorder(COUNTY, -Average_Perimeter), Average_Perimeter)
) +
  geom_col(aes(fill = Average_Perimeter)) +
  scale_fill_gradient(low = "grey", high = "blue") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(title = "Distribution of Average Florida Lake Size by County")
# A `geom_label(aes(label = COUNTY))` call used to follow labs() without a
# joining `+`. It was therefore never part of the plot and only printed a layer
# description to the console, so it has been removed; the rendered chart is
# unchanged.

ggplotly(bar_plot)
# Draw the outline of every lake in the file with sf. With no filter applied,
# the overall shape makes it clear that these lakes are in Florida.
plot(st_geometry(fl_lakes))

# Dig deeper by keeping only the lakes in Polk County, FL. The result is
# great, but without a point of reference it is hard to tell where these
# lakes are located.
polk_lakes <- filter(fl_lakes, COUNTY == "POLK")
tm_shape(polk_lakes) +
  tm_fill("lightblue") +
  tm_borders()

# With leaflet, centre the view on the Winter Haven lake that I live on.
# In the future I want to expand on using leaflet and shapefiles together for
# topographical projections.
polk_basemap <- setView(
  leaflet(polk_lakes),
  lng = -81.744296,
  lat = 28.023622,
  zoom = 14
)
addTiles(polk_basemap)
# Having the lake data by itself is great for projections, but it would be
# interesting to explore whether there is a relationship between the average
# lake perimeter (in meters) and the estimated population of each Florida
# county. The population data comes from
# https://www.bebr.ufl.edu/population/data; it was tidied in Excel rather than
# in R before being read here.
#
# The path is relative to the working directory (the report/ folder printed by
# getwd() earlier in this script) instead of a machine-specific absolute path.
fl_county_population <- read_excel(
  file.path("..", "data", "estimates_2020_table1_tidy.xlsx")
)
head(fl_county_population)

# Attach the population columns to the per-county lake summary so that lake
# size and population can be compared county by county.
fl_lakes_pop <- fl_lakes_summary %>%
  left_join(fl_county_population, by = "COUNTY")
head(fl_lakes_pop)

# left_join() keeps lake counties that have no identically spelled COUNTY in
# the population table and fills their population with NA. List those rows so
# a name mismatch is visible instead of silently dropping out of later steps.
fl_lakes_pop %>%
  filter(is.na(`2020_Est_Pop`))

# Spot-check the joined row for Polk County.
fl_lakes_pop %>%
  filter(COUNTY == "POLK")
# Very interesting (but not surprising) results from comparing the relationship between the average lake perimeter and the anticipated 2020 population by county. By annotating the clear outliers (above 10,000 meters average lake perimeter), we can see that early on in the model as a County's population is lower, there is an increase in the average size of the lake's perimeter. Aside from other variables this makes sense, the larger the average lake size in a given county, likely there will be less land mass for people to live on. There are exceptions in Florida however. For example, Lake Okeechobee is assigned to Palm Beach County (Even though it neighbors many counties) and Palm Beach has a relatively high estimated population count.


# Scatter plot of the average lake perimeter against the estimated 2020
# population, one point per county, with a smoothed trend line.
county_lm_model <- ggplot(
  fl_lakes_pop,
  mapping = aes(x = `2020_Est_Pop`, y = Average_Perimeter, label = COUNTY)
) +
  geom_point() +
  # Label only the highlighted counties; every other point gets an empty label.
  geom_label_repel(
    aes(
      label = ifelse(
        Average_Perimeter > 10000, COUNTY,
        ifelse(
          `2020_Est_Pop` > 2000000, COUNTY,
          # NOTE(review): 715090 singles out one county by its exact
          # population figure (presumably Polk) - confirm against the data.
          ifelse(`2020_Est_Pop` == 715090, COUNTY, "")
        )
      )
    )
  ) +
  labs(
    title = "Average Florida Lake Perimeter vs Estimated 2020 Population by County",
    x = "2020 Population",
    y = "Lake Perimeter (Meters)",
    caption = "Source: BEBR & FDEP "
  ) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  # Same smoother geom_smooth() picks by default for fewer than 1,000
  # observations; naming it avoids the auto-selection message.
  geom_smooth(method = "loess", formula = y ~ x) +
  # theme_minimal() is a complete theme and replaces every earlier theme()
  # setting, so the title-size tweak has to come after it to take effect.
  theme_minimal() +
  theme(plot.title = element_text(size = 10))

county_lm_model

# Ordinary least-squares fit with the 2020 population estimate as the response
# and the average lake perimeter as the single predictor, followed by the
# standard model summary.
county_model <- lm(
  formula = `2020_Est_Pop` ~ Average_Perimeter,
  data = fl_lakes_pop
)
summary(county_model)
## 
## Call:
## lm(formula = `2020_Est_Pop` ~ Average_Perimeter, data = fl_lakes_pop)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -500523 -273547 -219217   57694 2513604 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       270948.65   84572.93   3.204  0.00212 **
## Average_Perimeter     12.78      13.74   0.930  0.35572   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 513300 on 64 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01334,    Adjusted R-squared:  -0.002075 
## F-statistic: 0.8654 on 1 and 64 DF,  p-value: 0.3557
# I figured it would be interesting to analyze total lakes vs population. Keeping the same annotations as the prior plot, we can see that many of the counties shifted because they simply do not have nearly as many lakes; in the case of Palm Beach (Lake Okeechobee), there are actually very few lakes relative to other counties. Notice how the peak of the smoothed trend line sits further along the x-axis, which shows that even counties with higher populations have plenty of lakes. Polk County is a good example, with over 300 lakes and double the average county population. Maybe this indicates that people prefer lakes and potentially living on them?

# Scatter plot of the number of lakes against the estimated 2020 population,
# one point per county, with a smoothed trend line.
county_lm_model_total_lakes <- ggplot(
  fl_lakes_pop,
  mapping = aes(x = `2020_Est_Pop`, y = Total_Lakes, label = COUNTY)
) +
  geom_point() +
  # The labelling rule still tests Average_Perimeter although the y-axis is
  # Total_Lakes, so the same counties are annotated as in the perimeter plot.
  geom_label_repel(
    aes(
      label = ifelse(
        Average_Perimeter > 10000, COUNTY,
        ifelse(
          `2020_Est_Pop` > 2000000, COUNTY,
          # NOTE(review): 715090 singles out one county by its exact
          # population figure (presumably Polk) - confirm against the data.
          ifelse(`2020_Est_Pop` == 715090, COUNTY, "")
        )
      )
    )
  ) +
  labs(
    title = "Total Number of Lakes vs Estimated 2020 Population by County",
    x = "2020 Population",
    y = "Total Number of Lakes",
    caption = "Source: BEBR & FDEP "
  ) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  # Same smoother geom_smooth() picks by default for fewer than 1,000
  # observations; naming it avoids the auto-selection message.
  geom_smooth(method = "loess", formula = y ~ x) +
  # theme_minimal() is a complete theme and replaces every earlier theme()
  # setting, so the title-size tweak has to come after it to take effect.
  theme_minimal() +
  theme(plot.title = element_text(size = 10))

county_lm_model_total_lakes